/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.db; import java.io.*; import java.util.*; import net.nutch.io.*; import net.nutch.util.*; /********************************************************* * The EditSectionGroupWriter maintains a set of EditSectionWriter * objects. It chooses the appropriate EditSectionWriter to * carry out each operation. * * @author Mike Cafarella *********************************************************/ public class EditSectionGroupWriter { final static int CUR_VERSION = 0; // File for SectionGroupWriter meta-info public final static String GROUP_METAINFO = "group_metainfo"; // // Keyspace identifiers // public static int URL_KEYSPACE = 0; public static int MD5_KEYSPACE = 1; /********************************************************* * Edit instructions are Comparable, but they also have * an "inner" key like MD5Hash or URL that is also Comparable. * This class extracts that inner key, which we need for * allocating a Page or Link Instruction into the correct * bucket. *********************************************************/ public static abstract class KeyExtractor { /** */ public KeyExtractor() { } /** */ public abstract WritableComparable extractInnerKey(WritableComparable key); } /** * Get the URL from a PageInstruction */ public static class PageURLExtractor extends KeyExtractor { public PageURLExtractor() { } public WritableComparable extractInnerKey(WritableComparable key) { return ((DistributedWebDBWriter.PageInstruction) key).getPage().getURL(); } } /** * Get the MD5 from a PageInstruction */ public static class PageMD5Extractor extends KeyExtractor { public PageMD5Extractor() { } public WritableComparable extractInnerKey(WritableComparable key) { return ((DistributedWebDBWriter.PageInstruction) key).getPage().getMD5(); } } /** * Get the URL from a LinkInstruction */ public static class LinkURLExtractor extends KeyExtractor { public LinkURLExtractor() { } public WritableComparable extractInnerKey(WritableComparable key) { return ((DistributedWebDBWriter.LinkInstruction) key).getLink().getURL(); } } /** * Get the MD5 from a LinkInstruction */ public static class LinkMD5Extractor extends KeyExtractor { public LinkMD5Extractor() { } public WritableComparable extractInnerKey(WritableComparable key) { return ((DistributedWebDBWriter.LinkInstruction) key).getLink().getFromID(); } } /** * Initialize an EditSectionGroup. Tell it the label, the * keytype, and the division between keys. */ public static void createEditGroup(NutchFileSystem nutchfs, String dbName, String label, int numSections, int keySpaceType) throws IOException { // Max num-sections if (numSections > DBKeyDivision.MAX_SECTIONS) { throw new IllegalArgumentException("Maximum number of sections is " + DBKeyDivision.MAX_SECTIONS); } // Test for known keyspace type. if ((keySpaceType != URL_KEYSPACE) && (keySpaceType != MD5_KEYSPACE)) { throw new IllegalArgumentException("Unknown keyspace type: " + keySpaceType); } NutchFile metaInfoDir = new NutchFile(nutchfs, dbName, "standard", new File(GROUP_METAINFO)); NutchFile metaInfo = new NutchFile(metaInfoDir, label); File metaInfoFile = nutchfs.getWorkingFile(); DataOutputStream out = new DataOutputStream(new FileOutputStream(metaInfoFile)); try { out.write(CUR_VERSION); out.writeInt(keySpaceType); double stepSize = DBKeyDivision.MAX_SECTIONS / (1.0 * numSections); if (keySpaceType == URL_KEYSPACE) { UTF8 url = new UTF8(); for (int i = 0; i < numSections; i++) { url.set(DBKeyDivision.URL_KEYSPACE_DIVIDERS[(int) Math.round(i * stepSize)]); url.write(out); } } else { for (int i = 0; i < numSections; i++) { DBKeyDivision.MD5_KEYSPACE_DIVIDERS[(int) Math.round(i * stepSize)].write(out); } } } finally { out.close(); } nutchfs.put(metaInfo, metaInfoFile, true); } int machineNum = -1, totalMachines = 1; KeyExtractor extractor; String label; WritableComparable sectionKeys[]; EditSectionWriter sectionWriters[]; /** * Start a EditSectionGroupWriter at the indicated location, for * a single emitter. There will be as many of these as there * are processor-machines. (The emitter value will be different * for each.) The Group must already have been created via a * call to EditSectionGroupWriter.createEditSectionGroupWriter(). * * The EditSectionGroupWriter consists of a bunch of * EditSectionWriters, each of which hold a file we append to. */ public EditSectionGroupWriter(NutchFileSystem nutchfs, String dbName, int machineNum, int totalMachines, String label, Class keyClass, Class valClass, EditSectionGroupWriter.KeyExtractor extractor) throws IOException { this.machineNum = machineNum; this.totalMachines = totalMachines; this.extractor = extractor; // Bail if the emitter/section numbering is incorrect if (machineNum < 0 || machineNum >= totalMachines) { throw new IllegalArgumentException("machineNum is " + machineNum + ", and totalMachines is " + totalMachines); } // Load in details about keys NutchFile metaInfoDir = new NutchFile(nutchfs, dbName, "standard", new File(GROUP_METAINFO)); NutchFile metaInfo = new NutchFile(metaInfoDir, label); File metaInfoFile = nutchfs.get(metaInfo); DataInputStream in = new DataInputStream(new FileInputStream(metaInfoFile)); try { int version = in.read(); int keySpaceType = in.readInt(); this.sectionKeys = new WritableComparable[totalMachines]; for (int i = 0; i < sectionKeys.length; i++) { WritableComparable key = null; if (keySpaceType == URL_KEYSPACE) { key = new UTF8(); } else { key = new MD5Hash(); } key.readFields(in); this.sectionKeys[i] = key; } } finally { in.close(); } // Build all the sections this.sectionWriters = new EditSectionWriter[totalMachines]; for (int i = 0; i < sectionWriters.length; i++) { this.sectionWriters[i] = new EditSectionWriter(nutchfs, dbName, label, i, machineNum, keyClass, valClass); } this.label = label; } /** * Add an instruction and append it. We need to find an * appropriate EditSectionWriter. */ public void append(WritableComparable key, Writable val) throws IOException { WritableComparable innerKey = extractor.extractInnerKey(key); // If there is noplace to write the item, return if (sectionWriters.length == 0) { return; } // (start, end] is the range int start = 0, end = sectionWriters.length, pivot = 0; while (end - start > 1) { pivot = (end + start) / 2; int comparison = innerKey.compareTo(sectionKeys[pivot]); if (comparison < 0) { end = pivot; } else if (comparison >= 0) { start = pivot; } } sectionWriters[start].append(key, val); } /** * Close down the writers */ public void close() throws IOException { for (int i = 0; i < sectionWriters.length; i++) { sectionWriters[i].close(); } } }